In this notebook I download and unzip the Ford Go Bike data.

#rm(list = ls())

library(pacman)
p_load(tidyverse, tictoc, ggmap, skimr, lubridate, forcats, biganalytics, doParallel)

#tictoc: tic() before a chunk and toc() after it report how long that chunk of code takes to run

Downloading the data directly from https://s3.amazonaws.com/fordgobike-data

# 2017 trips are published as a single CSV covering all months.
# Make sure the target directory exists first: download.file() does not
# create missing directories for `destfile`.
dir.create("./data", showWarnings = FALSE, recursive = TRUE)

URL <- "https://s3.amazonaws.com/fordgobike-data/2017-fordgobike-tripdata.csv"
download.file(URL, destfile = "./data/2017-fordgobike-tripdata.csv", method = "curl")

# In 2018 the data was separated by month, so loop over the months to
# download every archive from January to July.
for (i in 1:7) {
  # Zero-padded month keeps the file names in the published YYYYMM form.
  zip_name <- sprintf("2018%02d-fordgobike-tripdata.csv.zip", i)
  URL <- paste0("https://s3.amazonaws.com/fordgobike-data/", zip_name)
  download.file(URL, destfile = paste0("./data/", zip_name), method = "curl")
}

Unzip downloaded files.

# Extract every monthly 2018 archive into ./data.
# The previously recorded run produced "error 1 in extracting from zip file"
# for all seven calls; presumably the archives were no longer on disk (the
# clean-up step below deletes them). Missing archives are therefore skipped
# explicitly with a message instead of triggering that warning.
zip_files <- paste0("./data/20180", 1:7, "-fordgobike-tripdata.csv.zip")
for (zip_file in zip_files) {
  if (file.exists(zip_file)) {
    unzip(zip_file, exdir = "./data")
  } else {
    message("Skipping missing archive: ", zip_file)
  }
}

Clean up data directory.

# Delete the downloaded 2018 zip archives month by month; an archive that is
# already gone is left alone. print() reproduces the value a top-level
# file.remove() call would echo.
for (fn in paste0("./data/20180", 1:7, "-fordgobike-tripdata.csv.zip")) {
  if (file.exists(fn)) print(file.remove(fn))
}

Read the .csv files

# Load the 2017 file and each extracted 2018 monthly file into its own tibble.
fordgobike2017   <- "./data/2017-fordgobike-tripdata.csv" %>% read_csv()
fordgobike201801 <- "./data/201801-fordgobike-tripdata.csv" %>% read_csv()
fordgobike201802 <- "./data/201802-fordgobike-tripdata.csv" %>% read_csv()
fordgobike201803 <- "./data/201803-fordgobike-tripdata.csv" %>% read_csv()
fordgobike201804 <- "./data/201804-fordgobike-tripdata.csv" %>% read_csv()
fordgobike201805 <- "./data/201805-fordgobike-tripdata.csv" %>% read_csv()
fordgobike201806 <- "./data/201806-fordgobike-tripdata.csv" %>% read_csv()
fordgobike201807 <- "./data/201807-fordgobike-tripdata.csv" %>% read_csv()

Check the head() of the loaded data.frames

# Show the first three rows of every loaded table.
fordgobike2017 %>% head(n = 3)
fordgobike201801 %>% head(n = 3)
fordgobike201802 %>% head(n = 3)
fordgobike201803 %>% head(n = 3)
fordgobike201804 %>% head(n = 3)
fordgobike201805 %>% head(n = 3)
fordgobike201806 %>% head(n = 3)
fordgobike201807 %>% head(n = 3)

Check the tail() of the loaded data.frames.

# Show the last three rows of every loaded table.
fordgobike2017 %>% tail(n = 3)
fordgobike201801 %>% tail(n = 3)
fordgobike201802 %>% tail(n = 3)
fordgobike201803 %>% tail(n = 3)
fordgobike201804 %>% tail(n = 3)
fordgobike201805 %>% tail(n = 3)
fordgobike201806 %>% tail(n = 3)
fordgobike201807 %>% tail(n = 3)

Check the dimension (number of rows and columns) of the data

# Report rows x columns for every loaded table (the 2017 file has 15 columns,
# the 2018 files 16, as the recorded output shows).
fordgobike2017 %>% dim()
## [1] 519700     15
fordgobike201801 %>% dim()
## [1] 94802    16
fordgobike201802 %>% dim()
## [1] 106718     16
fordgobike201803 %>% dim()
## [1] 111382     16
fordgobike201804 %>% dim()
## [1] 131169     16
fordgobike201805 %>% dim()
## [1] 179125     16
fordgobike201806 %>% dim()
## [1] 195968     16
fordgobike201807 %>% dim()
## [1] 199222     16

Convert the station ID columns of the June and July 2018 data to integers so the types are uniform, then merge all the 2018 months

# Coerce the station-id columns of the June and July 2018 tables to integer.
# Entries that cannot be converted become NA, which is what the recorded
# coercion warnings report.
fordgobike201806$start_station_id <- as.integer(fordgobike201806$start_station_id)
fordgobike201806$end_station_id <- as.integer(fordgobike201806$end_station_id)
## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion
fordgobike201807$start_station_id <- as.integer(fordgobike201807$start_station_id)
fordgobike201807$end_station_id <- as.integer(fordgobike201807$end_station_id)
## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion
# Stack the seven monthly tables into one 2018 table.
fordgobike2018 <- list(
  fordgobike201801, fordgobike201802, fordgobike201803, fordgobike201804,
  fordgobike201805, fordgobike201806, fordgobike201807
) %>%
  bind_rows()

fordgobike2018 %>% glimpse()
## Observations: 1,018,386
## Variables: 16
## $ duration_sec            <dbl> 75284, 85422, 71576, 61076, 39966, 6477, 453,…
## $ start_time              <dttm> 2018-01-31 22:52:35, 2018-01-31 16:13:34, 20…
## $ end_time                <dttm> 2018-02-01 19:47:19, 2018-02-01 15:57:17, 20…
## $ start_station_id        <dbl> 120, 15, 304, 75, 74, 236, 110, 81, 134, 305,…
## $ start_station_name      <chr> "Mission Dolores Park", "San Francisco Ferry …
## $ start_station_latitude  <dbl> 37.76142, 37.79539, 37.34876, 37.77379, 37.77…
## $ start_station_longitude <dbl> -122.4264, -122.3942, -121.8948, -122.4212, -…
## $ end_station_id          <dbl> 285, 15, 296, 47, 19, 160, 134, 93, 4, 317, 4…
## $ end_station_name        <chr> "Webster St at O'Farrell St", "San Francisco …
## $ end_station_latitude    <dbl> 37.78352, 37.79539, 37.32600, 37.78095, 37.78…
## $ end_station_longitude   <dbl> -122.4312, -122.3942, -121.8771, -122.3997, -…
## $ bike_id                 <dbl> 2765, 2815, 3039, 321, 617, 1306, 3571, 1403,…
## $ user_type               <chr> "Subscriber", "Customer", "Customer", "Custom…
## $ member_birth_year       <dbl> 1986, NA, 1996, NA, 1991, NA, 1988, 1980, 198…
## $ member_gender           <chr> "Male", NA, "Male", NA, "Male", NA, "Male", "…
## $ bike_share_for_all_trip <chr> "No", "No", "No", "No", "No", "No", "No", "No…

Merge 2017 and 2018 data

# Check the dimensions first: 2017 has one column fewer than 2018, so
# bind_rows() fills the missing column with NA for the 2017 rows.
dim(fordgobike2017)
## [1] 519700     15
dim(fordgobike2018)
## [1] 1018386      16
fordgobike <- bind_rows(fordgobike2017, fordgobike2018)
head(fordgobike, 3)
# Output the merged data as a csv file. row.names = FALSE keeps write.csv()
# from adding an unnamed row-number column that would come back as a
# spurious extra variable when the file is read again.
write.csv(fordgobike, file = "./data/fordgobike.csv", row.names = FALSE)

dim(fordgobike)
## [1] 1538086      16

Create new variables for “age”, “year”, “month”, and “day”

# Derive rider age plus the calendar year, month and day of each trip.
# Age is measured against the year the trip started, not the current date,
# so the values do not shift when the notebook is re-run in a later year.
fordgobike <- fordgobike %>%
  mutate(
    age = year(start_time) - member_birth_year,
    year = year(start_time),
    month = month(start_time),
    day = day(start_time)
  )

head(fordgobike, 3)
dim(fordgobike)
## [1] 1538086      20

Create a new variable “weekday”

# wday() returns the day of week as a number. With week_start = 1 the
# numbering is 1 = Monday ... 7 = Sunday, matching the label order below.
fordgobike <- fordgobike %>%
  mutate(week_day = wday(start_time, week_start = 1))

levels <- c("M","T","W","TH","F","SAT","SUN")

# Map the numeric codes 1..7 onto the labels. Passing the labels as `levels`
# (as before) matched nothing and turned every value into NA.
fordgobike$week_day <- factor(fordgobike$week_day, levels = 1:7, labels = levels)

head(fordgobike, 3)
dim(fordgobike)
## [1] 1538086      21

Distribution of riders by “age”: histograms for all riders, for riders aged 80 or under, for riders aged 100 or under, and for riders over 100.

# Number of trips per rider age, followed by a column-wise summary of the
# (age-grouped) table.
count(group_by(fordgobike, age))
summary(group_by(fordgobike, age))
##   duration_sec       start_time                     end_time                  
##  Min.   :   61.0   Min.   :2017-06-28 09:47:36   Min.   :2017-06-28 09:52:55  
##  1st Qu.:  361.0   1st Qu.:2017-11-14 10:08:31   1st Qu.:2017-11-14 10:21:12  
##  Median :  569.0   Median :2018-03-15 07:10:23   Median :2018-03-15 07:24:04  
##  Mean   :  957.4   Mean   :2018-02-22 12:28:46   Mean   :2018-02-22 12:44:43  
##  3rd Qu.:  897.0   3rd Qu.:2018-06-02 17:56:46   3rd Qu.:2018-06-02 18:19:06  
##  Max.   :86369.0   Max.   :2018-07-31 23:57:19   Max.   :2018-08-01 11:00:22  
##                                                                               
##  start_station_id start_station_name start_station_latitude
##  Min.   :  3.0    Length:1538086     Min.   :37.31         
##  1st Qu.: 28.0    Class :character   1st Qu.:37.77         
##  Median : 79.0    Mode  :character   Median :37.78         
##  Mean   :107.7                       Mean   :37.77         
##  3rd Qu.:173.0                       3rd Qu.:37.80         
##  Max.   :357.0                       Max.   :45.51         
##  NA's   :5245                                              
##  start_station_longitude end_station_id  end_station_name  
##  Min.   :-122.44         Min.   :  3.0   Length:1538086    
##  1st Qu.:-122.41         1st Qu.: 27.0   Class :character  
##  Median :-122.40         Median : 77.0   Mode  :character  
##  Mean   :-122.36         Mean   :105.6                     
##  3rd Qu.:-122.39         3rd Qu.:171.0                     
##  Max.   : -73.57         Max.   :357.0                     
##                          NA's   :5245                      
##  end_station_latitude end_station_longitude    bike_id      user_type        
##  Min.   :37.28        Min.   :-122.44       Min.   :  10   Length:1538086    
##  1st Qu.:37.77        1st Qu.:-122.41       1st Qu.:1045   Class :character  
##  Median :37.78        Median :-122.40       Median :2072   Mode  :character  
##  Mean   :37.77        Mean   :-122.35       Mean   :2021                     
##  3rd Qu.:37.80        3rd Qu.:-122.39       3rd Qu.:2952                     
##  Max.   :45.51        Max.   : -73.57       Max.   :4307                     
##                                                                              
##  member_birth_year member_gender      bike_share_for_all_trip      age        
##  Min.   :1881      Length:1538086     Length:1538086          Min.   : 20.0   
##  1st Qu.:1976      Class :character   Class :character        1st Qu.: 31.0   
##  Median :1984      Mode  :character   Mode  :character        Median : 36.0   
##  Mean   :1982                                                 Mean   : 38.2   
##  3rd Qu.:1989                                                 3rd Qu.: 44.0   
##  Max.   :2000                                                 Max.   :139.0   
##  NA's   :137667                                               NA's   :137667  
##       year          month           day           week_day      
##  Min.   :2017   Min.   : 1.0   Min.   : 1.00   M      :      0  
##  1st Qu.:2017   1st Qu.: 4.0   1st Qu.: 8.00   T      :      0  
##  Median :2018   Median : 6.0   Median :16.00   W      :      0  
##  Mean   :2018   Mean   : 6.3   Mean   :15.98   TH     :      0  
##  3rd Qu.:2018   3rd Qu.: 9.0   3rd Qu.:24.00   F      :      0  
##  Max.   :2018   Max.   :12.0   Max.   :31.00   (Other):      0  
##                                                NA's   :1538086
fordgobike %>% skim()
Data summary
Name fordgobike
Number of rows 1538086
Number of columns 21
_______________________
Column type frequency:
character 5
factor 1
numeric 13
POSIXct 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
start_station_name 0 1.00 4 63 0 316 0
end_station_name 0 1.00 4 63 0 316 0
user_type 0 1.00 8 10 0 2 0
member_gender 137326 0.91 4 6 0 3 0
bike_share_for_all_trip 519700 0.66 2 3 0 2 0

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
week_day 1538086 0 FALSE 0 M: 0, T: 0, W: 0, TH: 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
duration_sec 0 1.00 957.38 2891.83 61.00 361.00 569.00 897.00 86369.00 ▇▁▁▁▁
start_station_id 5245 1.00 107.70 92.97 3.00 28.00 79.00 173.00 357.00 ▇▅▂▂▁
start_station_latitude 0 1.00 37.77 0.10 37.31 37.77 37.78 37.80 45.51 ▇▁▁▁▁
start_station_longitude 0 1.00 -122.36 0.15 -122.44 -122.41 -122.40 -122.39 -73.57 ▇▁▁▁▁
end_station_id 5245 1.00 105.63 92.60 3.00 27.00 77.00 171.00 357.00 ▇▃▂▂▁
end_station_latitude 0 1.00 37.77 0.10 37.28 37.77 37.78 37.80 45.51 ▇▁▁▁▁
end_station_longitude 0 1.00 -122.35 0.15 -122.44 -122.41 -122.40 -122.39 -73.57 ▇▁▁▁▁
bike_id 0 1.00 2020.60 1152.29 10.00 1045.00 2072.00 2952.00 4307.00 ▇▇▇▇▅
member_birth_year 137667 0.91 1981.80 10.56 1881.00 1976.00 1984.00 1989.00 2000.00 ▁▁▁▂▇
age 137667 0.91 38.20 10.56 20.00 31.00 36.00 44.00 139.00 ▇▂▁▁▁
year 0 1.00 2017.66 0.47 2017.00 2017.00 2018.00 2018.00 2018.00 ▅▁▁▁▇
month 0 1.00 6.30 3.06 1.00 4.00 6.00 9.00 12.00 ▆▆▇▃▅
day 0 1.00 15.98 8.78 1.00 8.00 16.00 24.00 31.00 ▇▇▇▇▇

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
start_time 0 1 2017-06-28 09:47:36 2018-07-31 23:57:19 2018-03-15 07:10:23 1538011
end_time 0 1 2017-06-28 09:52:55 2018-08-01 11:00:22 2018-03-15 07:24:04 1538010
# Age histograms: all riders, then riders aged <= 80, <= 100, and > 100.
ggplot(fordgobike, aes(x = age)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(filter(fordgobike, age <= 80), aes(x = age)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(filter(fordgobike, age <= 100), aes(x = age)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(filter(fordgobike, age > 100), aes(x = age)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Group by “gender”, “age” (below 81) and plot their histograms

# Trips per gender/age combination.
fordgobike %>% group_by(member_gender, age) %>% count()
# Age histograms split by gender. `class` is not a ggplot2 aesthetic, so the
# gender mapping was previously ignored; `fill` colours the bars by gender.
fordgobike %>%
  ggplot(aes(x = age, fill = member_gender)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 137667 rows containing non-finite values (stat_bin).

# Same plot on the density scale.
fordgobike %>%
  ggplot(aes(x = age, fill = member_gender)) +
  geom_histogram(aes(y = ..density..))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 137667 rows containing non-finite values (stat_bin).

# Overall age histogram restricted to riders aged 80 or under.
fordgobike %>%
  filter(age <= 80) %>%
  ggplot(aes(x = age)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot histograms of people below 81 years old and facet by gender with relative colors

# Count histogram of riders aged 80 or under, one facet row per gender.
ggplot(
  filter(fordgobike, age <= 80),
  aes(x = age, color = member_gender)
) +
  geom_histogram(position = "identity") +
  facet_grid(member_gender ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same layout on the density scale.
ggplot(
  filter(fordgobike, age <= 80),
  aes(x = age, color = member_gender)
) +
  geom_histogram(aes(y = ..density..), position = "identity") +
  facet_grid(member_gender ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plotting the “Year”, “month”, and day of week.

# Trip counts per year.
ggplot(fordgobike, aes(x = year)) +
  geom_bar()

# Trip counts per month, one facet row per year.
ggplot(fordgobike, aes(x = month)) +
  geom_bar() +
  facet_grid(year ~ .)

# Trip counts per day of month, one facet row per year.
ggplot(fordgobike, aes(x = day)) +
  geom_bar() +
  facet_grid(year ~ .)

Removing geocode outliers, subset longitude and latitude, and plot the riders’ location in the Bay Area

# Drop geocode outliers. The recorded summaries show start coordinates
# reaching latitude 45.51 and longitude -73.57, while the bulk of the data
# sits near latitude 37.8 and longitude -122.4. Longitudes here are
# negative, so the bound must be -120; the previous `< 120` was always TRUE
# and filtered nothing.
fordgobike2018 <- fordgobike2018 %>%
  filter(start_station_latitude < 38 & start_station_longitude < -120)

# Keep only the start coordinates for plotting and clustering.
fordgobike_subset <- fordgobike2018 %>%
  select(start_station_longitude, start_station_latitude)

fordgobike_subset %>%
  ggplot(aes(x = start_station_longitude, y = start_station_latitude)) +
  geom_point()

Store the data as a matrix, create a k-means cluster for each location (Oakland, San Jose, San Francisco), and map them

tic()
registerDoParallel(cores = 8)

head(fordgobike2018)
# bigkmeans() is given the coordinates as a plain numeric matrix.
fordgobike_subset2 <- as.matrix(fordgobike_subset)

# Seed the random number generator. The previous `set.seed <- 123454321`
# only created a variable named `set.seed` and never seeded anything.
# NOTE(review): with a parallel backend registered, the random starts may run
# in worker processes that this seed does not control - confirm whether full
# reproducibility is required.
set.seed(123454321)

# Three clusters (one per city), best of 8 random starts.
cl <- bigkmeans(fordgobike_subset2, 3, nstart = 8)

print(cl$centers)
##           [,1]     [,2]
## [1,] -121.8953 37.34168
## [2,] -122.2660 37.83117
## [3,] -122.4072 37.77809
# Store each trip's cluster assignment alongside the trip data.
fordgobike2018 <- mutate(fordgobike2018, clust = cl$cluster)

# Scatter plot of start coordinates coloured by cluster assignment.
ggplot(
  fordgobike_subset,
  aes(x = start_station_longitude, y = start_station_latitude, color = cl$cluster)
) +
  geom_point()

toc()
## 60.854 sec elapsed

Plotting the stations

# https://stackoverflow.com/questions/20621250/simple-approach-to-assigning-clusters-for-new-data-after-k-means-clustering

print(cl$centers)
##           [,1]     [,2]
## [1,] -121.8953 37.34168
## [2,] -122.2660 37.83117
## [3,] -122.4072 37.77809
# Return the index of the cluster center nearest to a point.
#
# x:       numeric vector with the point's coordinates, in the same column
#          order as `centers` (longitude, latitude for the fitted model).
# centers: matrix with one cluster center per row. Defaults to the centers
#          of the fitted k-means object `cl`, so existing one-argument calls
#          behave as before.
#
# Returns the row index of the center with the smallest Euclidean distance
# to `x` (the first one in case of ties).
closest.cluster <- function(x, centers = cl$centers) {
  cluster.dist <- apply(centers, 1, function(y) sqrt(sum((x - y)^2)))
  which.min(cluster.dist)[1]
}

# Look up which cluster index belongs to each city by finding the center
# closest to a reference (longitude, latitude) point for that city.
oak <- closest.cluster(x = c(-122.2711, 37.8044))
sj  <- closest.cluster(x = c(-121.8953, 37.34168))
sf  <- closest.cluster(x = c(-122.4072, 37.77809))

# Oakland stations: trips whose start point fell in the Oakland cluster.
oakland <- filter(fordgobike2018, clust == oak)
ggplot(oakland, aes(x = start_station_longitude, y = start_station_latitude)) +
  geom_point() +
  ggtitle("Oakland Ford Go Bike stations")

# San Jose stations
san_jose <- filter(fordgobike2018, clust == sj)
ggplot(san_jose, aes(x = start_station_longitude, y = start_station_latitude)) +
  geom_point() +
  ggtitle("San Jose Ford Go Bike stations")

# San Francisco stations
san_francisco <- filter(fordgobike2018, clust == sf)
ggplot(san_francisco, aes(x = start_station_longitude, y = start_station_latitude)) +
  geom_point() +
  ggtitle("San Francisco Ford Go Bike stations")

Register with Google Maps and locate bike stations

#register_google(key = "XXXXXXXXXXXXXXXXX-XXXXXXXXXX", write = TRUE)

# Oakland: road map centered on the Oakland cluster center, with that
# cluster's start points overlaid.
ggmap(
  get_map(
    location = c(lon = cl$centers[oak, 1], lat = cl$centers[oak, 2]),
    zoom = 12, maptype = "roadmap"
  )
) +
  geom_point(
    data = oakland,
    aes(x = start_station_longitude, y = start_station_latitude),
    size = 1, shape = 19
  ) +
  ggtitle("Oakland Ford Go Bike stations")
## Source : https://maps.googleapis.com/maps/api/staticmap?center=37.831171,-122.26603&zoom=12&size=640x640&scale=2&maptype=roadmap&language=en-EN&key=xxx-mUUagmuvZ5bE

# San Jose
ggmap(
  get_map(
    location = c(lon = cl$centers[sj, 1], lat = cl$centers[sj, 2]),
    zoom = 12, maptype = "roadmap"
  )
) +
  geom_point(
    data = san_jose,
    aes(x = start_station_longitude, y = start_station_latitude),
    size = 1, shape = 19
  ) +
  ggtitle("San Jose Ford Go Bike stations")
## Source : https://maps.googleapis.com/maps/api/staticmap?center=37.341677,-121.895287&zoom=12&size=640x640&scale=2&maptype=roadmap&language=en-EN&key=xxx-mUUagmuvZ5bE
## Warning: Removed 8 rows containing missing values (geom_point).

# San Francisco
ggmap(
  get_map(
    location = c(lon = cl$centers[sf, 1], lat = cl$centers[sf, 2]),
    zoom = 12, maptype = "roadmap"
  )
) +
  geom_point(
    data = san_francisco,
    aes(x = start_station_longitude, y = start_station_latitude),
    size = 1, shape = 19
  ) +
  ggtitle("San Francisco Ford Go Bike stations")
## Source : https://maps.googleapis.com/maps/api/staticmap?center=37.778085,-122.407228&zoom=12&size=640x640&scale=2&maptype=roadmap&language=en-EN&key=xxx-mUUagmuvZ5bE

Bike stations in the Bay Area

tic()

# Hayward is used as the map location to better capture San Jose.
bayarea <- get_map(location = "hayward") 
## Source : https://maps.googleapis.com/maps/api/staticmap?center=hayward&zoom=10&size=640x640&scale=2&maptype=terrain&language=en-EN&key=xxx-mUUagmuvZ5bE
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=hayward&key=xxx-mUUagmuvZ5bE
# alpha is a fixed transparency, so it is set outside aes(); inside aes() it
# was treated as a mapped variable and added a meaningless legend.
# clust is a cluster label, so it is mapped as a factor to get a discrete
# colour per cluster instead of a continuous gradient.
ggmap(bayarea) +
  geom_point(
    data = fordgobike2018,
    aes(
      x = start_station_longitude,
      y = start_station_latitude,
      color = factor(clust)
    ),
    alpha = 0.1, size = 1, shape = 19
  ) +
  labs(color = "clust") +
  ggtitle("Bay Area Ford Go Bike stations")
## Warning: Removed 262 rows containing missing values (geom_point).

toc()
## 69.024 sec elapsed

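Gender of users in the Bay Area, and then by city. In the run recorded above the cluster centers correspond to San Jose = 1, Oakland = 2, San Francisco = 3; k-means cluster numbering can differ between runs.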

# Duration distribution by gender: bars stack every trip's duration, so the
# bar height is the total ride time per gender (in millions of seconds).
fordgobike2018 %>%
  ggplot(aes(x = member_gender, y = duration_sec / 1000000)) +
  geom_bar(stat = "Identity") +
  ylab("Duration (in Million Sec)") +
  xlab("Gender") +
  ggtitle("Duration Distribution By Gender")

# Duration distribution by gender for each city.
# The cluster index is replaced by the city name using the `oak` and `sj`
# indices looked up earlier; every other cluster is San Francisco. The
# earlier ggtitle("Bay Area") was dropped because the later ggtitle()
# overwrote it.
fordgobike2018 %>%
  mutate(
    clust = case_when(
      clust == oak ~ "Oakland",
      clust == sj ~ "San Jose",
      TRUE ~ "San Francisco"
    )
  ) %>%
  ggplot(aes(x = member_gender, y = duration_sec / 1000000)) +
  geom_bar(stat = "Identity") +
  ylab("Duration (in Million Sec)") +
  xlab("Gender") +
  ggtitle("Duration Distribution by Gender for Each City") +
  facet_grid(clust ~ .)

Plot the density histograms of ride durations in the Bay Area by gender

# Density histograms of ride durations (x axis limited to 0-10000 seconds).
fordgobike2018 %>%
  ggplot(aes(x = duration_sec, y = ..density..)) +
  scale_x_continuous(limits = c(0, 10000)) +
  geom_histogram() +
  geom_density(aes(y = ..density..))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 5581 rows containing non-finite values (stat_bin).
## Warning: Removed 5581 rows containing non-finite values (stat_density).
## Warning: Removed 2 rows containing missing values (geom_bar).

# Density histograms of log ride durations.
# The `x =` name belongs to aes(), not to log(): the old
# aes(log(x=duration_sec)) only worked because the first positional argument
# of aes() is x, and it labelled the axis "log(x = duration_sec)".
fordgobike2018 %>%
  ggplot(aes(x = log(duration_sec), y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Density histograms of ride durations by gender.
fordgobike2018 %>%
  ggplot(aes(x = duration_sec, y = ..density..)) +
  scale_x_continuous(limits = c(0, 10000)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  facet_grid(member_gender ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 5581 rows containing non-finite values (stat_bin).
## Warning: Removed 5581 rows containing non-finite values (stat_density).
## Warning: Removed 8 rows containing missing values (geom_bar).

# Density histograms of log ride durations by gender.
fordgobike2018 %>%
  ggplot(aes(x = log(duration_sec), y = ..density..)) +
  geom_histogram() +
  geom_density(aes(y = ..density..)) +
  facet_grid(member_gender ~ .)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Summary by gender for each city

# Mean and standard deviation of ride duration per gender, city by city.
# Cities are selected through the `oak`, `sj` and `sf` cluster indices looked
# up with closest.cluster(). The hard-coded 1/2/3 used before did not match
# the labels: in the recorded centers, cluster 1 is the San Jose center and
# cluster 2 the Oakland one, and the numbering can change between runs.
#Oak
fordgobike2018 %>%
  filter(clust == oak) %>%
  group_by(member_gender) %>%
  summarize(dur_mean = mean(duration_sec), dur_sd = sd(duration_sec))
#SJ
fordgobike2018 %>%
  filter(clust == sj) %>%
  group_by(member_gender) %>%
  summarize(dur_mean = mean(duration_sec), dur_sd = sd(duration_sec))
#SF
fordgobike2018 %>%
  filter(clust == sf) %>%
  group_by(member_gender) %>%
  summarize(dur_mean = mean(duration_sec), dur_sd = sd(duration_sec))